Neve, Viva and Yaohan work together on the scripts, and then finish the write-up separately.
# Read and process Chicago boundary data
chicagoBoundary <-
st_read(file.path(root.dir, "/Chapter5/chicagoBoundary.geojson")) %>% # Read Chicago boundary data
st_transform('ESRI:102271') # Transform coordinate reference system
# Chicago Neighborhoods
neighborhoods <-
st_read("https://raw.githubusercontent.com/blackmad/neighborhoods/master/chicago.geojson") %>%
st_transform('ESRI:102271')
# Read and process police districts data
policeDistricts <-
st_read("https://data.cityofchicago.org/api/geospatial/fthy-xz3r?method=export&format=GeoJSON") %>%
st_transform('ESRI:102271') %>% # Transform coordinate reference system
dplyr::select(District = dist_num) # Select only the district number, renaming it to 'District'
# Read and process damagelaries data
crimes2018 <-
read.socrata("https://data.cityofchicago.org/resource/3i3m-jwuy.json")
## rank the crimes by the count of each type
crimes2018.asdf <- crimes2018 %>%
group_by(primary_type, description) %>%
tally() %>%
arrange(desc(n))
## choose CRIMINAL DAMAGE TO PROPERTY as dependent variable within the boundary
damage2018 <- crimes2018 %>%
filter(primary_type == "CRIMINAL DAMAGE" & description == "TO PROPERTY") %>%
na.omit() %>%
st_as_sf(coords = c("location.longitude", "location.latitude"), crs = 4326, agr = "constant") %>%
st_transform('ESRI:102271') %>%
st_intersection(chicagoBoundary) %>%
distinct()
# mapping crime data in dots and in the fishnet
fishnet <-
st_make_grid(chicagoBoundary,
cellsize = 500,
square = TRUE) %>%
.[chicagoBoundary] %>% # fast way to select intersecting polygons
st_sf() %>% mutate(uniqueID = 1:n())
crime_net <-
dplyr::select(damage2018) %>%
mutate(count.damage = 1) %>%
aggregate(., fishnet, sum) %>%
mutate(count.damage = replace_na(count.damage, 0),
uniqueID = 1:n(),
cvID = sample(round(nrow(fishnet) / 24),
size=nrow(fishnet), replace = TRUE))
ggplot() +
geom_sf(data = chicagoBoundary) + # Add Chicago boundary
geom_sf(data = damage2018, colour = "red", size = 0.005, show.legend = "point") +
labs(title = "Criminal Damage, 2018 Chicago") +
theme_void() +
theme(plot.title = element_text(size = 12))+
ggplot() +
geom_sf(data = crime_net, aes(fill = count.damage), color = NA) +
scale_fill_viridis("Count of Criminal Damage") +
labs(title = "Fishnet of Criminal Damage by Count") +
theme_void()+
theme(plot.title = element_text(size = 12))
Variable 1: Abandoned Cars reason of the choice Variable 2: Abandoned Buildings reason of the choice Variable 3: Graffiti Variable 4: Disfunctional Street Lights Variable 5: Sanitation Complaints Variable 6: Liquor Retail Variable 7: Park Variable 8: Environmental Complaints
# load variable
abandonCars <-
read.socrata("https://data.cityofchicago.org/Service-Requests/311-Service-Requests-Abandoned-Vehicles/3c9v-pnva") %>%
# Extract the year from the creation date and filter for the year 2017
mutate(year = substr(creation_date, 1, 4)) %>% filter(year == "2018") %>%
# Select latitude and longitude columns and remove rows with missing values
dplyr::select(Y = latitude, X = longitude) %>%
na.omit() %>%
# Convert to simple feature (sf) object with geographic coordinates
st_as_sf(coords = c("X", "Y"), crs = 4326, agr = "constant") %>%
# Transform coordinates to match the coordinate reference system (CRS) of the fishnet
st_transform(st_crs(fishnet)) %>%
# Add a legend label indicating abandoned cars
mutate(Legend = "Abandoned_Cars")
abandonBuildings <-
read.socrata("https://data.cityofchicago.org/Service-Requests/311-Service-Requests-Vacant-and-Abandoned-Building/7nii-7srd") %>%
mutate(year = substr(date_service_request_was_received,1,4)) %>%
filter(year == "2018") %>%
dplyr::select(Y = latitude, X = longitude) %>%
na.omit() %>%
st_as_sf(coords = c("X", "Y"), crs = 4326, agr = "constant") %>%
st_transform(st_crs(fishnet)) %>%
mutate(Legend = "Abandoned_Buildings")
graffiti <-
read.socrata("https://data.cityofchicago.org/Service-Requests/311-Service-Requests-Graffiti-Removal-Historical/hec5-y4x5") %>%
mutate(year = substr(creation_date,1,4)) %>% filter(year == "2018") %>%
filter(where_is_the_graffiti_located_ %in% c("Front", "Rear", "Side")) %>%
dplyr::select(Y = latitude, X = longitude) %>%
na.omit() %>%
st_as_sf(coords = c("X", "Y"), crs = 4326, agr = "constant") %>%
st_transform(st_crs(fishnet)) %>%
mutate(Legend = "Graffiti")
streetLightsOut <-
read.socrata("https://data.cityofchicago.org/Service-Requests/311-Service-Requests-Street-Lights-All-Out/zuxi-7xem") %>%
mutate(year = substr(creation_date,1,4)) %>% filter(year == "2018") %>%
dplyr::select(Y = latitude, X = longitude) %>%
na.omit() %>%
st_as_sf(coords = c("X", "Y"), crs = 4326, agr = "constant") %>%
st_transform(st_crs(fishnet)) %>%
mutate(Legend = "Street_Lights_Out")
sanitation <-
read.socrata("https://data.cityofchicago.org/Service-Requests/311-Service-Requests-Sanitation-Code-Complaints-Hi/me59-5fac") %>%
mutate(year = substr(creation_date,1,4)) %>% filter(year == "2018") %>%
dplyr::select(Y = latitude, X = longitude) %>%
na.omit() %>%
st_as_sf(coords = c("X", "Y"), crs = 4326, agr = "constant") %>%
st_transform(st_crs(fishnet)) %>%
mutate(Legend = "Sanitation")
liquorRetail <-
read.socrata("https://data.cityofchicago.org/resource/nrmj-3kcf.json") %>%
filter(business_activity == "Retail Sales of Packaged Liquor") %>%
dplyr::select(Y = latitude, X = longitude) %>%
na.omit() %>%
st_as_sf(coords = c("X", "Y"), crs = 4326, agr = "constant") %>%
st_transform(st_crs(fishnet)) %>%
mutate(Legend = "Liquor_Retail")
park <-
read.socrata("https://data.cityofchicago.org/resource/eix4-gf83.json") %>%
select(X = x_coord, Y = y_coord) %>%
st_as_sf(coords = c("X", "Y"), crs = 4326, agr = "constant") %>%
st_transform(st_crs(fishnet)) %>%
mutate(Legend = "Park")
environ.complaint <-
read.socrata("https://data.cityofchicago.org/resource/fypr-ksnz.json")
environ.complaint <- environ.complaint %>%
mutate(year = substr(complaint_date,1,4)) %>% filter(year == "2018") %>%
dplyr::select(Y = latitude, X = longitude) %>%
na.omit() %>%
st_as_sf(coords = c("X", "Y"), crs = 4326, agr = "constant") %>%
st_transform(st_crs(fishnet)) %>%
mutate(Legend = "Environmental_Complaint")
# add variables to the fishnet
vars_net <-
rbind(abandonCars,streetLightsOut,abandonBuildings,
liquorRetail, graffiti, sanitation, park, environ.complaint) %>% #add on
st_join(., fishnet, join=st_within) %>%
st_drop_geometry() %>%
group_by(uniqueID, Legend) %>%
summarize(count = n()) %>%
full_join(fishnet) %>%
spread(Legend, count, fill=0) %>%
st_sf() %>%
dplyr::select(-`<NA>`) %>%
na.omit() %>%
ungroup()
vars_net.long <- vars_net %>%
gather(Variable, value, -geometry, -uniqueID) %>%
na.omit()
# mapping risk factors
vars_risk <- unique(vars_net.long$Variable)
mapList <- list()
for(i in vars_risk)
{mapList[[i]] <-
ggplot() +
geom_sf(data = filter(vars_net.long, Variable == i), aes(fill=value), colour=NA) +
scale_fill_viridis(name="") +
labs(title=i) +
theme_void()}
do.call(grid.arrange,c(mapList, ncol=3, top="Risk Factors by Fishnet"))
vars_net <-
vars_net %>%
mutate(
Abandoned_Buildings.nn =
nn_function(st_coordinates(st_centroid(vars_net)), st_coordinates(abandonBuildings),3),
Abandoned_Cars.nn =
nn_function(st_coordinates(st_centroid(vars_net)), st_coordinates(abandonCars),3),
Graffiti.nn =
nn_function(st_coordinates(st_centroid(vars_net)), st_coordinates(graffiti),3),
Liquor_Retail.nn =
nn_function(st_coordinates(st_centroid(vars_net)), st_coordinates(liquorRetail),3),
Street_Lights_Out.nn =
nn_function(st_coordinates(st_centroid(vars_net)), st_coordinates(streetLightsOut),3),
Park.nn =
nn_function(st_coordinates(st_centroid(vars_net)), st_coordinates(park),3),
Sanitation.nn =
nn_function(st_coordinates(st_centroid(vars_net)), st_coordinates(sanitation),3),
Environmental_Complaint.nn =
nn_function(st_coordinates(st_centroid(vars_net)), st_coordinates(environ.complaint),3))
nn.vars_net.long <-
dplyr::select(vars_net, ends_with(".nn")) %>%
gather(Variable, value, -geometry)
nn.vars <- unique(nn.vars_net.long$Variable)
nn.mapList <- list()
for(i in nn.vars){
nn.mapList[[i]] <-
ggplot() +
geom_sf(data = filter(nn.vars_net.long, Variable == i), aes(fill=value), colour=NA) +
scale_fill_viridis(name="") +
labs(title=i) +
theme_void()+
theme(plot.title = element_text(size = 10))}
do.call(grid.arrange,c(nn.mapList, ncol=3, top="Nearest Neighbor Risk Factors by Fishnet"))
# add spatial factor
final_net <- final_net %>%
mutate(damage.isSig =
ifelse(damage.localMorans$hotspot == 1, 1, 0)) %>%
mutate(damage.isSig.dist =
nn_function(st_coordinates(st_centroid(final_net)),
st_coordinates(st_centroid(filter(final_net,
damage.isSig == 1))),
k = 1))
# calculate correlations
correlation.long <-
st_drop_geometry(final_net) %>%
dplyr::select(-uniqueID, -cvID, -name, -District) %>%
gather(Variable, Value, -count.damage)
correlation.cor <-
correlation.long %>%
group_by(Variable) %>%
summarize(correlation = cor(Value, count.damage, use = "complete.obs"))
ggplot(correlation.long, aes(Value, count.damage)) +
geom_point(size = 0.1) +
geom_text(data = correlation.cor, aes(label = paste("r =", round(correlation, 2))),
x = -Inf, y = Inf, vjust = 1.5, hjust = -0.1) +
geom_smooth(method = "lm", se = FALSE, colour = "red", size = 1) +
facet_wrap(~Variable, ncol = 4, scales="free") +
labs(title = "Criminal Damage Count as a Function of Risk Factors") +
theme_bw()+
theme(plot.title = element_text(size = 10),
strip.text = element_text(size = 10))
This histogram suggests an OLS regression is not appropriate method of analysis.
ggplot(final_net, aes(x = count.damage)) +
geom_histogram(binwidth = 3, fill = "grey", color = "black") +
labs(title = "Distribution of Criminal Damage Counts, Chicago", x = "Damage Count", y = "Frequency") +
theme_minimal()
## define the variables we want
reg.vars <- c(nn.vars)
reg.ss.vars <- c(nn.vars, "damage.isSig", "damage.isSig.dist")
## RUN REGRESSIONS
reg.cv <- crossValidate(
dataset = final_net,
id = "cvID",
dependentVariable = "count.damage",
indVariables = reg.vars) %>%
dplyr::select(cvID = cvID, count.damage, Prediction, geometry)
reg.ss.cv <- crossValidate(
dataset = final_net,
id = "cvID",
dependentVariable = "count.damage",
indVariables = reg.ss.vars) %>%
dplyr::select(cvID = cvID, count.damage, Prediction, geometry)
reg.spatialCV <- crossValidate(
dataset = final_net,
id = "name",
dependentVariable = "count.damage",
indVariables = reg.vars) %>%
dplyr::select(cvID = name, count.damage, Prediction, geometry)
reg.ss.spatialCV <- crossValidate(
dataset = final_net,
id = "name",
dependentVariable = "count.damage",
indVariables = reg.ss.vars) %>%
dplyr::select(cvID = name, count.damage, Prediction, geometry)
# calculate error
reg.summary <-
rbind(
mutate(reg.cv, Error = Prediction - count.damage,
Regression = "Random k-fold CV: Just Risk Factors"),
mutate(reg.ss.cv, Error = Prediction - count.damage,
Regression = "Random k-fold CV: Spatial Process"),
mutate(reg.spatialCV, Error = Prediction - count.damage,
Regression = "Spatial LOGO-CV: Just Risk Factors"),
mutate(reg.ss.spatialCV, Error = Prediction - count.damage,
Regression = "Spatial LOGO-CV: Spatial Process")) %>%
st_sf()
error_by_reg_and_fold <-
reg.summary %>%
group_by(Regression, cvID) %>%
summarize(Mean_Error = mean(Prediction - count.damage, na.rm = T),
MAE = abs(Mean_Error), na.rm = T) %>%
ungroup()
#remove afterwards
error_by_reg_and_fold %>%
ggplot(aes(MAE)) +
geom_histogram(bins = 30, colour="black", fill = "#FDE725FF") +
facet_wrap(~Regression) +
geom_vline(xintercept = 0) + scale_x_continuous(breaks = seq(0, 8, by = 1)) +
labs(title="Distribution of MAE", subtitle = "k-fold cross validation vs. LOGO-CV",
x="Mean Absolute Error", y="Count") +
theme_bw()
error_by_reg_and_fold %>%
filter(str_detect(Regression, "k-fold")) %>%
ggplot() +
geom_sf(aes(fill = MAE)) +
facet_wrap(~Regression) +
scale_fill_viridis() +
labs(title = "Damage errors by k-fold Regression") +
theme_void()
error_by_reg_and_fold %>%
filter(str_detect(Regression, "LOGO")) %>%
ggplot() +
geom_sf(aes(fill = MAE)) +
facet_wrap(~Regression) +
scale_fill_viridis() +
labs(title = "Damage errors by LOGO-CV Regression") +
theme_void()
st_drop_geometry(error_by_reg_and_fold) %>%
group_by(Regression) %>%
summarize(Mean_MAE = round(mean(MAE), 2),
SD_MAE = round(sd(MAE), 2)) %>%
kable() %>%
kable_styling("striped", "hover")
| Regression | Mean_MAE | SD_MAE |
|---|---|---|
| Random k-fold CV: Just Risk Factors | 0.95 | 0.83 |
| Random k-fold CV: Spatial Process | 0.81 | 0.76 |
| Spatial LOGO-CV: Just Risk Factors | 2.07 | 1.87 |
| Spatial LOGO-CV: Spatial Process | 1.57 | 1.57 |
tracts18 <-
get_acs(geography = "tract", variables = c("B01001_001E","B01001A_001E"),
year = 2018, state=17, county=031, geometry=T) %>%
st_transform('ESRI:102271') %>%
dplyr::select(variable, estimate, GEOID) %>%
spread(variable, estimate) %>%
rename(TotalPop = B01001_001,
NumberWhites = B01001A_001) %>%
mutate(percentWhite = NumberWhites / TotalPop,
raceContext = ifelse(percentWhite > .5, "Majority_White", "Majority_Non_White")) %>%
.[neighborhoods,]
reg.summary %>%
st_centroid() %>%
st_join(tracts18) %>%
na.omit() %>%
st_drop_geometry() %>%
group_by(Regression, raceContext) %>%
summarize(mean.Error = mean(Error, na.rm = T)) %>%
spread(raceContext, mean.Error) %>%
kable(caption = "Mean Error by neighborhood racial context, 2018") %>%
kable_styling("striped", "hover")
| Regression | Majority_Non_White | Majority_White |
|---|---|---|
| Random k-fold CV: Just Risk Factors | -0.8912638 | 0.9710527 |
| Random k-fold CV: Spatial Process | -0.4063060 | 0.4393822 |
| Spatial LOGO-CV: Just Risk Factors | -0.9661727 | 1.0010598 |
| Spatial LOGO-CV: Spatial Process | -0.4408509 | 0.4422612 |
damage_ppp <- as.ppp(st_coordinates(damage2018), W = st_bbox(final_net))
damage_KD.1000 <- density.ppp(damage_ppp, 1000)
damage_KD.1500 <- density.ppp(damage_ppp, 1500)
damage_KD.2000 <- density.ppp(damage_ppp, 2000)
damage_KD.df <- rbind(
mutate(data.frame(rasterToPoints(mask(raster(damage_KD.1000), as(neighborhoods, 'Spatial')))), Legend = "1000 m"),
mutate(data.frame(rasterToPoints(mask(raster(damage_KD.1500), as(neighborhoods, 'Spatial')))), Legend = "1500 m"),
mutate(data.frame(rasterToPoints(mask(raster(damage_KD.2000), as(neighborhoods, 'Spatial')))), Legend = "2000 m"))
damage_KD.df$Legend <- factor(damage_KD.df$Legend, levels = c("1000 m", "1500 m", "2000 m"))
ggplot(data=damage_KD.df, aes(x=x, y=y)) +
geom_raster(aes(fill=layer)) +
facet_wrap(~Legend) +
coord_sf(crs=st_crs(final_net)) +
scale_fill_viridis(name="Density") +
labs(title = "Kernel density with 3 different search radii") +
theme_void()+
theme(legend.position = "right",
plot.title = element_text(size = 12))
# load 2019 data
damage19 <-
read.socrata("https://data.cityofchicago.org/resource/w98m-zvie.json")
# select crime data
damage19 <- damage19 %>%
filter(primary_type == "CRIMINAL DAMAGE" & description == "TO PROPERTY") %>%
na.omit() %>% # Remove rows with missing values
st_as_sf(coords = c("location.longitude", "location.latitude"), crs = 4326, agr = "constant") %>% # Convert to sf object with specified CRS
st_transform('ESRI:102271') %>% # Transform coordinate reference system
distinct() %>% # Keep only distinct geometries
.[fishnet,]
# add risk category
damage_KDE_sf <- as.data.frame(damage_KD.1000) %>%
st_as_sf(coords = c("x", "y"), crs = st_crs(final_net)) %>%
aggregate(., final_net, mean) %>%
mutate(label = "Kernel Density",
Risk_Category = ntile(value, 100),
Risk_Category = case_when(
Risk_Category >= 90 ~ "90% to 100%",
Risk_Category >= 70 & Risk_Category <= 89 ~ "70% to 89%",
Risk_Category >= 50 & Risk_Category <= 69 ~ "50% to 69%",
Risk_Category >= 30 & Risk_Category <= 49 ~ "30% to 49%",
Risk_Category >= 1 & Risk_Category <= 29 ~ "1% to 29%")) %>%
cbind(
aggregate(
dplyr::select(damage19) %>% mutate(damageCount = 1), ., sum) %>%
mutate(damageCount = replace_na(damageCount, 0))) %>%
dplyr::select(label, Risk_Category, damageCount)
damage_risk_sf <-
filter(reg.summary, Regression == "Spatial LOGO-CV: Spatial Process") %>%
mutate(label = "Risk Predictions",
Risk_Category = ntile(Prediction, 100),
Risk_Category = case_when(
Risk_Category >= 90 ~ "90% to 100%",
Risk_Category >= 70 & Risk_Category <= 89 ~ "70% to 89%",
Risk_Category >= 50 & Risk_Category <= 69 ~ "50% to 69%",
Risk_Category >= 30 & Risk_Category <= 49 ~ "30% to 49%",
Risk_Category >= 1 & Risk_Category <= 29 ~ "1% to 29%")) %>%
cbind(
aggregate(
dplyr::select(damage19) %>% mutate(damageCount = 1), ., sum) %>%
mutate(damageCount = replace_na(damageCount, 0))) %>%
dplyr::select(label,Risk_Category, damageCount)
rbind(damage_KDE_sf, damage_risk_sf) %>%
na.omit() %>%
gather(Variable, Value, -label, -Risk_Category, -geometry) %>%
ggplot() +
geom_sf(aes(fill = Risk_Category), colour = NA) +
geom_sf(data = sample_n(damage19, 3000), size = .1, colour = "black") +
facet_wrap(~label, ) +
scale_fill_viridis(discrete = TRUE) +
labs(title="Comparison of Kernel Density and Risk Predictions",
subtitle="2018 criminal risk predictions; 2019 criminal damage") +
mapTheme()
rbind(damage_KDE_sf, damage_risk_sf) %>%
st_set_geometry(NULL) %>% na.omit() %>%
gather(Variable, Value, -label, -Risk_Category) %>%
group_by(label, Risk_Category) %>%
summarize(count.theft = sum(Value)) %>%
ungroup() %>%
group_by(label) %>%
mutate(Rate_of_test_set_crimes = count.theft / sum(count.theft)) %>%
ggplot(aes(Risk_Category,Rate_of_test_set_crimes)) +
geom_bar(aes(fill=label), position="dodge", stat="identity") +
scale_fill_viridis(discrete = TRUE) +
labs(title = "Risk prediction vs. Kernel density, 2019 criminal damage") +
plotTheme() + theme(axis.text.x = element_text(angle = 45, vjust = 0.5))